Explore some GAM models
Five minute H2S
h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws +
## I(1/MinDist^2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.173e+00 3.209e-03 365.68 <2e-16 ***
## year2023 -3.275e-01 2.799e-03 -117.01 <2e-16 ***
## wd_secQ2 -2.566e-01 3.748e-03 -68.47 <2e-16 ***
## wd_secQ3 -2.906e-01 3.889e-03 -74.74 <2e-16 ***
## wd_secQ4 -2.103e-01 3.489e-03 -60.27 <2e-16 ***
## ws -5.429e-02 4.174e-04 -130.06 <2e-16 ***
## I(1/MinDist^2) 2.115e+05 2.329e+03 90.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.996 8 2499 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0948 Deviance explained = 9.48%
## GCV = 0.92955 Scale est. = 0.92953 n = 772917
plot(h2s_model_a)

gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4619032 246.7 8761935 468.0 7327279 391.4
## Vcells 104368480 796.3 272043532 2075.6 226506938 1728.2
h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
s(mon_utm_x, mon_utm_y, bs='tp', k = 8),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) +
## s(mon_utm_x, mon_utm_y, bs = "tp", k = 8)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.014e+00 3.167e-03 320.23 <2e-16 ***
## wd_secQ2 -1.962e-01 3.692e-03 -53.14 <2e-16 ***
## wd_secQ3 -1.830e-01 3.900e-03 -46.92 <2e-16 ***
## wd_secQ4 -1.139e-01 3.482e-03 -32.71 <2e-16 ***
## ws -6.928e-02 4.144e-04 -167.17 <2e-16 ***
## I(1/MinDist^2) 2.989e+05 2.856e+03 104.67 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.999 8 1711 <2e-16 ***
## s(mon_utm_x,mon_utm_y) 6.999 7 6597 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.131 Deviance explained = 13.1%
## GCV = 0.89269 Scale est. = 0.89267 n = 772917
plot(h2s_model_b)


gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4632709 247.5 8761935 468.0 7327279 391.4
## Vcells 114894044 876.6 326532238 2491.3 317817903 2424.8
# Include converted angle and weekday
h2s_model_c <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + wd_sec + ws +
I(1/MinDist^2) + Converted_Angle +
s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
as.factor(weekday),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws +
## I(1/MinDist^2) + Converted_Angle + s(mon_utm_x, mon_utm_y,
## bs = "tp", k = 10) + as.factor(weekday)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.496e+00 2.319e-02 107.606 < 2e-16 ***
## year2023 -3.332e-01 2.865e-03 -116.278 < 2e-16 ***
## wd_secQ2 -1.871e-01 3.641e-03 -51.379 < 2e-16 ***
## wd_secQ3 -1.928e-01 3.843e-03 -50.165 < 2e-16 ***
## wd_secQ4 -1.104e-01 3.439e-03 -32.108 < 2e-16 ***
## ws -6.128e-02 4.078e-04 -150.288 < 2e-16 ***
## I(1/MinDist^2) -3.307e-05 3.303e-07 -100.112 < 2e-16 ***
## Converted_Angle -6.129e-03 1.103e-04 -55.545 < 2e-16 ***
## as.factor(weekday).L 9.819e-02 2.801e-03 35.048 < 2e-16 ***
## as.factor(weekday).Q -1.708e-01 2.808e-03 -60.809 < 2e-16 ***
## as.factor(weekday).C -2.851e-03 2.803e-03 -1.017 0.309
## as.factor(weekday)^4 -2.022e-02 2.815e-03 -7.185 6.72e-13 ***
## as.factor(weekday)^5 -5.940e-02 2.806e-03 -21.172 < 2e-16 ***
## as.factor(weekday)^6 -2.384e-02 2.819e-03 -8.457 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.998 8 2708 <2e-16 ***
## s(mon_utm_x,mon_utm_y) 8.999 9 6521 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 30/31
## R-sq.(adj) = 0.156 Deviance explained = 15.6%
## GCV = 0.86676 Scale est. = 0.86672 n = 772917
plot(h2s_model_c)


gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4634573 247.6 8761935 468.0 7327279 391.4
## Vcells 126927999 968.4 391918685 2990.2 379825124 2897.9
# Include converted angle and weekday
h2s_model_d <- gam(H2S ~
s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_sec + ws + downwind_ref + downwind_wrp +
I(1/MinDist^2) + dist_wrp + capacity +
Converted_Angle +
s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km
,
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) +
## dist_wrp + capacity + Converted_Angle + s(mon_utm_x, mon_utm_y,
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.642e+00 5.385e-02 -49.056 < 2e-16 ***
## year2023 -1.044e-01 6.558e-03 -15.914 < 2e-16 ***
## as.factor(weekday).L 1.077e-01 2.988e-03 36.055 < 2e-16 ***
## as.factor(weekday).Q -1.851e-01 2.993e-03 -61.867 < 2e-16 ***
## as.factor(weekday).C 1.469e-04 2.978e-03 0.049 0.96066
## as.factor(weekday)^4 -1.439e-02 2.982e-03 -4.824 1.41e-06 ***
## as.factor(weekday)^5 -6.110e-02 2.971e-03 -20.566 < 2e-16 ***
## as.factor(weekday)^6 -2.740e-02 2.982e-03 -9.188 < 2e-16 ***
## wd_secQ2 -1.532e-01 3.930e-03 -38.984 < 2e-16 ***
## wd_secQ3 -1.332e-01 4.153e-03 -32.078 < 2e-16 ***
## wd_secQ4 -7.319e-02 3.671e-03 -19.937 < 2e-16 ***
## ws -7.080e-02 4.332e-04 -163.434 < 2e-16 ***
## downwind_ref 1.429e-01 4.128e-03 34.614 < 2e-16 ***
## downwind_wrp -1.389e-02 4.573e-03 -3.038 0.00238 **
## I(1/MinDist^2) 3.304e-05 1.031e-06 32.048 < 2e-16 ***
## dist_wrp 4.645e-04 9.171e-06 50.643 < 2e-16 ***
## capacity 5.504e-03 8.696e-05 63.294 < 2e-16 ***
## Converted_Angle -2.974e-03 1.184e-04 -25.115 < 2e-16 ***
## monthly_oil_1km 6.487e-05 3.240e-06 20.023 < 2e-16 ***
## monthly_gas_1km 2.866e-04 1.230e-05 23.298 < 2e-16 ***
## active_1km -3.712e-02 1.334e-03 -27.827 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.997 8.000 1017 <2e-16 ***
## s(mon_utm_x,mon_utm_y) 8.007 8.007 5214 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 36/38
## R-sq.(adj) = 0.162 Deviance explained = 16.2%
## GCV = 0.89963 Scale est. = 0.89958 n = 712259
plot(h2s_model_d)


gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4635468 247.6 8761935 468.0 7327279 391.4
## Vcells 141999864 1083.4 470382422 3588.8 391840534 2989.6
# Include elvation, EVI, 3D smooth, odor, dist to dom channel, temp, hum, precip
h2s_model_e <- readRDS('rfiles/h2s_model_e.rds')
# h2s_model_e <- gam(H2S ~
# s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
# wd_sec + ws + downwind_ref + downwind_wrp +
# I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
# Converted_Angle + elevation + EVI +
# s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
# te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
# monthly_oil_1km + monthly_gas_1km + active_1km +
# num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip
# ,
# data = full_data %>% filter(day >= '2022-02-01'))
#saveRDS(h2s_model_e, 'rfiles/h2s_model_e.rds')
summary(h2s_model_e)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) +
## I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle +
## elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.285e+00 6.510e-01 3.510 0.000449 ***
## year2023 9.870e-02 1.712e-02 5.766 8.12e-09 ***
## as.factor(weekday).L 1.129e-01 2.852e-03 39.594 < 2e-16 ***
## as.factor(weekday).Q -1.747e-01 2.841e-03 -61.484 < 2e-16 ***
## as.factor(weekday).C 1.584e-02 2.824e-03 5.609 2.03e-08 ***
## as.factor(weekday)^4 -1.810e-02 2.828e-03 -6.400 1.55e-10 ***
## as.factor(weekday)^5 -5.583e-02 2.808e-03 -19.883 < 2e-16 ***
## as.factor(weekday)^6 -2.485e-02 2.822e-03 -8.806 < 2e-16 ***
## wd_secQ2 -1.031e-01 3.741e-03 -27.569 < 2e-16 ***
## wd_secQ3 -1.454e-01 3.962e-03 -36.702 < 2e-16 ***
## wd_secQ4 -1.146e-01 3.490e-03 -32.841 < 2e-16 ***
## ws -6.036e-02 4.158e-04 -145.169 < 2e-16 ***
## downwind_ref 1.098e-01 3.923e-03 28.001 < 2e-16 ***
## downwind_wrp 8.079e-03 4.326e-03 1.868 0.061802 .
## I(1/MinDist^2) 1.707e-05 6.264e-06 2.726 0.006420 **
## I(1/dist_wrp^2) 1.056e-06 3.794e-07 2.783 0.005394 **
## capacity 7.434e-03 1.338e-03 5.556 2.75e-08 ***
## wrp_angle 4.063e-03 9.670e-04 4.201 2.65e-05 ***
## Converted_Angle -1.586e-02 2.157e-03 -7.352 1.95e-13 ***
## elevation -2.059e-01 1.293e-02 -15.920 < 2e-16 ***
## EVI 2.975e+00 6.113e-01 4.868 1.13e-06 ***
## monthly_oil_1km 1.856e-04 6.710e-06 27.663 < 2e-16 ***
## monthly_gas_1km -4.097e-04 3.277e-05 -12.504 < 2e-16 ***
## active_1km -3.057e-02 2.530e-03 -12.081 < 2e-16 ***
## num_odor_complaints 2.821e-02 1.917e-03 14.718 < 2e-16 ***
## I(1/dist_dc^2) 1.019e-04 9.015e-06 11.303 < 2e-16 ***
## avg_temp 9.080e-03 4.090e-04 22.202 < 2e-16 ***
## avg_hum -7.505e-03 1.056e-04 -71.073 < 2e-16 ***
## precip -8.285e-02 5.145e-03 -16.102 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.994 8.000 310.126
## s(mon_utm_x,mon_utm_y) 1.865 1.865 0.459
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 81.249 81.253 899.338
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(mon_utm_x,mon_utm_y) 0.542
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 122/135
## R-sq.(adj) = 0.255 Deviance explained = 25.5%
## GCV = 0.80029 Scale est. = 0.80016 n = 712259
plot(h2s_model_e)



gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4643231 248.0 8761935 468.0 7327279 391.4
## Vcells 180130953 1374.3 470382422 3588.8 391840534 2989.6
# Model only the month of the event, October 2021
h2s_model_f <- readRDS('rfiles/h2s_model_f.rds')
# h2s_model_f <- gam(H2S ~ wd_sec + ws + downwind_ref + downwind_wrp +
# I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
# Converted_Angle + elevation + EVI +
# s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
# te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
# monthly_oil_1km + monthly_gas_1km + active_1km +
# num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip,
# data = full_data %>% filter(year == '2021', month == '10'))
# saveRDS(h2s_model_f, 'rfiles/h2s_model_f.rds')
summary(h2s_model_f)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) +
## I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle +
## elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.099e-07 4.542e-07 2.003 0.04516 *
## wd_secQ2 3.129e+00 9.512e-01 3.289 0.00101 **
## wd_secQ3 6.611e+00 9.439e-01 7.004 2.50e-12 ***
## wd_secQ4 7.882e+00 8.910e-01 8.845 < 2e-16 ***
## ws -2.321e+00 1.159e-01 -20.026 < 2e-16 ***
## downwind_ref 1.639e+00 8.589e-01 1.908 0.05642 .
## downwind_wrp 1.494e+00 1.077e+00 1.388 0.16529
## I(1/MinDist^2) -4.561e-03 1.475e-03 -3.093 0.00198 **
## I(1/dist_wrp^2) -9.325e-04 5.499e-05 -16.959 < 2e-16 ***
## capacity 7.671e-01 1.692e-01 4.533 5.82e-06 ***
## wrp_angle 8.449e-01 1.825e-01 4.629 3.68e-06 ***
## Converted_Angle -2.556e+00 3.095e-01 -8.258 < 2e-16 ***
## elevation -2.652e+01 2.679e+00 -9.899 < 2e-16 ***
## EVI 8.337e+02 1.155e+02 7.220 5.23e-13 ***
## monthly_oil_1km 1.533e-02 7.959e-03 1.926 0.05412 .
## monthly_gas_1km 2.295e-03 1.205e-03 1.905 0.05680 .
## active_1km 3.981e-05 2.000e-05 1.991 0.04647 *
## num_odor_complaints 8.458e-01 6.767e-02 12.498 < 2e-16 ***
## I(1/dist_dc^2) 7.748e+00 3.991e-01 19.415 < 2e-16 ***
## avg_temp 1.349e+00 1.864e-01 7.236 4.66e-13 ***
## avg_hum 1.629e-01 5.258e-02 3.099 0.00194 **
## precip -4.019e+01 7.628e+00 -5.269 1.37e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(mon_utm_x,mon_utm_y) 2.017 2.02 4.625
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 84.337 84.58 61.960
## p-value
## s(mon_utm_x,mon_utm_y) 0.00868 **
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 108/120
## R-sq.(adj) = 0.265 Deviance explained = 26.5%
## GCV = 5448.6 Scale est. = 5441.4 n = 77510
plot(h2s_model_f)


gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4644754 248.1 8761935 468.0 7327279 391.4
## Vcells 184117355 1404.8 470382422 3588.8 391840534 2989.6
# Model data excluding the months of the event
h2s_model_g <- readRDS('rfiles/h2s_model_g.rds')
# h2s_model_g <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
# wd_sec + ws + downwind_ref + downwind_wrp +
# I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
# Converted_Angle + elevation + EVI +
# s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
# te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
# monthly_oil_1km + monthly_gas_1km + active_1km +
# num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip
# ,
# data = full_data %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
# saveRDS(h2s_model_g, 'rfiles/h2s_model_g.rds')
summary(h2s_model_g)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) +
## I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle +
## elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.539e+00 3.610e-01 15.343 < 2e-16 ***
## year2021 3.707e-01 1.375e-02 26.967 < 2e-16 ***
## year2022 -5.225e-01 2.573e-02 -20.307 < 2e-16 ***
## year2023 -6.492e-01 2.547e-02 -25.492 < 2e-16 ***
## as.factor(weekday).L 8.394e-02 2.184e-03 38.426 < 2e-16 ***
## as.factor(weekday).Q -1.356e-01 2.182e-03 -62.151 < 2e-16 ***
## as.factor(weekday).C 1.406e-03 2.184e-03 0.644 0.5197
## as.factor(weekday)^4 -2.889e-02 2.188e-03 -13.204 < 2e-16 ***
## as.factor(weekday)^5 -1.732e-02 2.181e-03 -7.943 1.98e-15 ***
## as.factor(weekday)^6 -5.050e-03 2.184e-03 -2.312 0.0208 *
## wd_secQ2 -1.049e-01 2.952e-03 -35.543 < 2e-16 ***
## wd_secQ3 -1.878e-01 2.958e-03 -63.463 < 2e-16 ***
## wd_secQ4 -1.651e-01 2.796e-03 -59.057 < 2e-16 ***
## ws -4.613e-02 3.094e-04 -149.116 < 2e-16 ***
## downwind_ref 3.540e-02 2.614e-03 13.545 < 2e-16 ***
## downwind_wrp 2.151e-02 3.270e-03 6.577 4.80e-11 ***
## I(1/MinDist^2) -4.864e-05 8.792e-06 -5.533 3.15e-08 ***
## I(1/dist_wrp^2) 1.483e-06 9.456e-07 1.568 0.1168
## capacity 5.414e-03 5.868e-04 9.225 < 2e-16 ***
## wrp_angle 3.176e-03 7.460e-04 4.258 2.06e-05 ***
## Converted_Angle -1.473e-02 1.020e-03 -14.451 < 2e-16 ***
## elevation -1.200e-01 9.824e-03 -12.215 < 2e-16 ***
## EVI 2.459e+00 4.676e-01 5.259 1.44e-07 ***
## monthly_oil_1km -2.575e-05 2.232e-06 -11.534 < 2e-16 ***
## monthly_gas_1km -1.831e-04 9.137e-06 -20.036 < 2e-16 ***
## active_1km -3.881e-02 9.430e-04 -41.155 < 2e-16 ***
## num_odor_complaints 2.632e-02 1.466e-03 17.949 < 2e-16 ***
## I(1/dist_dc^2) 2.766e-03 3.971e-03 0.696 0.4861
## avg_temp 6.183e-03 2.696e-04 22.937 < 2e-16 ***
## avg_hum -6.301e-03 7.620e-05 -82.688 < 2e-16 ***
## precip -7.811e-02 5.252e-03 -14.872 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.999 8.000 2547.0
## s(mon_utm_x,mon_utm_y) 1.682 1.683 246.9
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 85.198 85.296 1455.0
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(mon_utm_x,mon_utm_y) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 129/137
## R-sq.(adj) = 0.166 Deviance explained = 16.6%
## GCV = 1.1485 Scale est. = 1.1484 n = 1696814
plot(h2s_model_g)



gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4647230 248.2 8761935 468.0 7327279 391.4
## Vcells 274409002 2093.6 470382422 3588.8 391840534 2989.6
Daily max H2S
# With month, year, wind sector/speed, dist to refinery, refinery
h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_avg+ws_avg+
I(1/MinDist^2),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_avg +
## ws_avg + I(1/MinDist^2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.020e+00 3.464e-01 8.718 <2e-16 ***
## year2023 -4.317e-01 2.195e-01 -1.967 0.0493 *
## wd_avg 6.905e-04 1.032e-03 0.669 0.5034
## ws_avg -8.775e-02 6.334e-02 -1.385 0.1661
## I(1/MinDist^2) 7.628e-07 8.197e-08 9.306 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.185 8 2.019 6.71e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 12/13
## R-sq.(adj) = 0.0087 Deviance explained = 1.06%
## GCV = 21.435 Scale est. = 21.386 n = 2667
plot(h2s_dm_model_a)

# Also with location of monitor
h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
I(1/MinDist^2)+Refinery+s(monitor_lat, monitor_lon, bs='tp', k=10),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg +
## I(1/MinDist^2) + Refinery + s(monitor_lat, monitor_lon, bs = "tp",
## k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.409e+00 1.421e+01 0.662 0.50803
## wd_avg 3.025e-03 1.007e-03 3.003 0.00269 **
## ws_avg -3.111e-01 6.322e-02 -4.921 9.14e-07 ***
## I(1/MinDist^2) -6.196e-05 1.642e-03 -0.038 0.96991
## RefineryMarathon (Carson) -8.957e-01 1.763e+01 -0.051 0.95948
## RefineryMarathon (Wilmington) -3.725e+00 1.747e+01 -0.213 0.83120
## RefineryPhillips 66 (Wilmington) -1.409e+01 1.628e+01 -0.865 0.38708
## RefineryTorrance Refinery -2.584e+00 1.402e+01 -0.184 0.85377
## RefineryValero Refinery -7.558e+00 1.768e+01 -0.428 0.66901
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.036 8.000 1.725 0.000201 ***
## s(monitor_lat,monitor_lon) 5.380 5.673 9.290 8.68e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 25/26
## R-sq.(adj) = 0.127 Deviance explained = 13.2%
## GCV = 18.946 Scale est. = 18.836 n = 2667
plot(h2s_dm_model_b)


# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
I(1/MinDist^2)+Converted_Angle+
s(monitor_lat, monitor_lon, bs='tp', k=10),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg +
## I(1/MinDist^2) + Converted_Angle + s(monitor_lat, monitor_lon,
## bs = "tp", k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.607e+00 8.814e-01 4.092 4.4e-05 ***
## wd_avg 2.917e-03 1.011e-03 2.884 0.00395 **
## ws_avg -2.238e-01 6.195e-02 -3.612 0.00031 ***
## I(1/MinDist^2) -8.262e-06 3.830e-05 -0.216 0.82923
## Converted_Angle -3.430e-03 3.891e-03 -0.882 0.37805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.114 8.000 2.046 4.81e-05 ***
## s(monitor_lat,monitor_lon) 7.081 7.926 41.174 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 21/22
## R-sq.(adj) = 0.117 Deviance explained = 12.1%
## GCV = 19.148 Scale est. = 19.053 n = 2667
plot(h2s_dm_model_c)


h2s_dm_model_d <- gam(H2S_daily_max ~
s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/MinDist^2) + dist_wrp + capacity +
Converted_Angle +
s(monitor_lat, monitor_lon, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km
,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + dist_wrp +
## capacity + Converted_Angle + s(monitor_lat, monitor_lon,
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.136e+01 4.634e+00 -2.452 0.014273 *
## year2023 -4.471e-01 4.297e-01 -1.040 0.298239
## as.factor(weekday).L 4.596e-01 2.417e-01 1.901 0.057365 .
## as.factor(weekday).Q -9.234e-01 2.421e-01 -3.814 0.000140 ***
## as.factor(weekday).C -8.155e-03 2.404e-01 -0.034 0.972936
## as.factor(weekday)^4 -1.166e-01 2.399e-01 -0.486 0.627170
## as.factor(weekday)^5 -3.736e-01 2.388e-01 -1.564 0.117844
## as.factor(weekday)^6 -1.931e-01 2.394e-01 -0.807 0.419866
## wd_avg 3.799e-03 1.098e-03 3.459 0.000551 ***
## ws_avg -3.497e-01 6.974e-02 -5.014 5.73e-07 ***
## daily_downwind_ref 7.552e-01 4.163e-01 1.814 0.069803 .
## I(1/MinDist^2) 1.152e-04 4.177e-04 0.276 0.782730
## dist_wrp 2.158e-03 7.129e-04 3.027 0.002500 **
## capacity 9.067e-03 1.252e-02 0.724 0.469156
## Converted_Angle 6.465e-03 9.437e-03 0.685 0.493379
## monthly_oil_1km -5.757e-05 1.609e-04 -0.358 0.720447
## monthly_gas_1km -2.126e-04 6.315e-04 -0.337 0.736385
## active_1km 4.756e-03 4.832e-02 0.098 0.921606
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 1.721 8.000 0.912 0.00637 **
## s(monitor_lat,monitor_lon) 7.785 7.976 13.083 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 34/35
## R-sq.(adj) = 0.137 Deviance explained = 14.6%
## GCV = 20.162 Scale est. = 19.942 n = 2431
plot(h2s_dm_model_d)


# Try to include as many variables as possible
h2s_dm_model_e <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
capacity + daily_downwind_wrp +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_e)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km +
## elevation + EVI + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.916e+00 3.078e+00 2.896 0.003811 **
## year2023 -4.825e-01 4.432e-01 -1.089 0.276410
## as.character(weekday)Mon -7.024e-01 3.365e-01 -2.088 0.036932 *
## as.character(weekday)Sat -7.705e-01 3.426e-01 -2.249 0.024595 *
## as.character(weekday)Sun -1.300e+00 3.422e-01 -3.800 0.000149 ***
## as.character(weekday)Thu -3.722e-01 3.406e-01 -1.093 0.274635
## as.character(weekday)Tue -1.518e-01 3.350e-01 -0.453 0.650577
## as.character(weekday)Wed -4.269e-02 3.405e-01 -0.125 0.900238
## wd_avg 2.712e-03 1.129e-03 2.401 0.016405 *
## ws_avg -2.786e-01 7.195e-02 -3.872 0.000111 ***
## daily_downwind_ref 5.252e-01 4.120e-01 1.275 0.202453
## I(1/dist_wrp^2) -1.689e-06 3.125e-07 -5.403 7.22e-08 ***
## capacity -3.644e-03 3.724e-03 -0.978 0.327933
## daily_downwind_wrp 1.881e-01 4.335e-01 0.434 0.664394
## monthly_oil_1km -7.268e-05 1.628e-04 -0.446 0.655356
## monthly_gas_1km -3.721e-04 6.523e-04 -0.571 0.568376
## active_1km 1.370e-02 5.147e-02 0.266 0.790046
## elevation -1.032e-01 6.119e-02 -1.686 0.091878 .
## EVI -3.506e+00 8.125e-01 -4.315 1.66e-05 ***
## num_odor_complaints 1.538e-01 1.550e-01 0.992 0.321095
## I(1/dist_dc^2) -7.080e-05 1.720e-05 -4.117 3.97e-05 ***
## avg_temp 1.721e-02 2.313e-02 0.744 0.456876
## avg_hum -2.319e-02 6.853e-03 -3.384 0.000726 ***
## precip 4.843e-01 4.540e-01 1.067 0.286252
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.047 8.00 0.236
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 6.913 7.57 8.877
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 3.511 77.00 0.103
## p-value
## s(as.numeric(month)) 0.09437 .
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 0.00963 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 116/118
## R-sq.(adj) = 0.144 Deviance explained = 15.5%
## GCV = 20.074 Scale est. = 19.797 n = 2431
plot(h2s_dm_model_e)



e <- getViz(h2s_dm_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_f <- gam(H2S_daily_max ~ as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
capacity + daily_downwind_wrp +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(year == '2021', month %in% c('10', '11', '12')))
summary(h2s_dm_model_f)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ as.character(weekday) + wd_avg + ws_avg + daily_downwind_ref +
## I(1/dist_wrp^2) + capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km +
## active_1km + elevation + EVI + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.502e+00 1.301e+00 -2.693 0.00725 **
## as.character(weekday)Mon 3.939e+00 2.581e+01 0.153 0.87873
## as.character(weekday)Sat -1.275e+00 2.497e+01 -0.051 0.95930
## as.character(weekday)Sun 6.518e+00 2.516e+01 0.259 0.79561
## as.character(weekday)Thu 1.072e+01 2.548e+01 0.421 0.67406
## as.character(weekday)Tue -1.204e+01 2.627e+01 -0.458 0.64683
## as.character(weekday)Wed 9.587e+00 2.591e+01 0.370 0.71144
## wd_avg 7.981e-02 8.258e-02 0.966 0.33411
## ws_avg 1.029e-01 6.617e+00 0.016 0.98760
## daily_downwind_ref -3.077e+01 2.440e+01 -1.261 0.20754
## I(1/dist_wrp^2) -1.388e-03 1.470e-04 -9.442 < 2e-16 ***
## capacity 6.856e+00 7.105e-01 9.650 < 2e-16 ***
## daily_downwind_wrp 8.300e+00 2.955e+01 0.281 0.77891
## monthly_oil_1km 6.095e-02 1.223e-01 0.498 0.61833
## monthly_gas_1km 3.919e-03 3.281e-01 0.012 0.99047
## active_1km -1.056e+02 3.923e+01 -2.693 0.00725 **
## elevation 2.452e+01 7.897e+00 3.104 0.00198 **
## EVI 8.040e+02 1.046e+02 7.689 4.69e-14 ***
## num_odor_complaints 3.663e+00 8.699e-01 4.211 2.85e-05 ***
## I(1/dist_dc^2) 9.637e+00 9.958e-01 9.678 < 2e-16 ***
## avg_temp 2.544e+00 2.454e+00 1.037 0.30029
## avg_hum 2.180e-01 5.860e-01 0.372 0.71001
## precip -6.490e+00 2.637e+01 -0.246 0.80566
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.929 8.99 12.18
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 52.052 80.00 4.36
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 109/112
## R-sq.(adj) = 0.511 Deviance explained = 55.8%
## GCV = 37085 Scale est. = 33454 n = 827
plot(h2s_dm_model_f)


f <- getViz(h2s_dm_model_f)
plot(sm(f, 1)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_g <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
capacity + daily_downwind_wrp +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
summary(h2s_dm_model_g)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km +
## elevation + EVI + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.549e+00 4.812e+00 1.985 0.047239 *
## year2021 -3.312e-01 6.114e-01 -0.542 0.588082
## year2022 -2.741e-01 8.979e-01 -0.305 0.760155
## year2023 -4.830e-01 1.051e+00 -0.459 0.645947
## as.character(weekday)Mon -3.089e-01 2.867e-01 -1.078 0.281286
## as.character(weekday)Sat -5.170e-01 2.867e-01 -1.803 0.071381 .
## as.character(weekday)Sun -8.541e-01 2.872e-01 -2.974 0.002948 **
## as.character(weekday)Thu -5.403e-02 2.869e-01 -0.188 0.850630
## as.character(weekday)Tue 2.851e-01 2.853e-01 0.999 0.317608
## as.character(weekday)Wed 6.704e-02 2.872e-01 0.233 0.815450
## wd_avg 1.637e-03 1.043e-03 1.570 0.116515
## ws_avg -2.778e-01 6.387e-02 -4.349 1.39e-05 ***
## daily_downwind_ref 1.678e-01 2.823e-01 0.595 0.552119
## I(1/dist_wrp^2) -2.894e-06 5.987e-07 -4.833 1.38e-06 ***
## capacity 3.271e-04 7.242e-03 0.045 0.963973
## daily_downwind_wrp -6.160e-02 3.434e-01 -0.179 0.857664
## monthly_oil_1km -1.339e-06 1.627e-04 -0.008 0.993434
## monthly_gas_1km 2.002e-04 6.188e-04 0.324 0.746321
## active_1km 2.663e-02 5.038e-02 0.529 0.597102
## elevation -2.393e-01 7.356e-02 -3.253 0.001147 **
## EVI -4.285e+00 1.114e+00 -3.846 0.000121 ***
## num_odor_complaints 3.875e-01 1.302e-01 2.975 0.002939 **
## I(1/dist_dc^2) 2.339e-04 2.144e-03 0.109 0.913132
## avg_temp -6.570e-03 2.333e-02 -0.282 0.778232
## avg_hum -2.571e-02 6.760e-03 -3.804 0.000144 ***
## precip 2.603e-01 5.213e-01 0.499 0.617515
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 5.201 8.000 1.089
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 7.851 8.085 6.333
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 15.404 80.000 0.458
## p-value
## s(as.numeric(month)) 0.082687 .
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 0.000162 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 121/123
## R-sq.(adj) = 0.0639 Deviance explained = 7.21%
## GCV = 34.93 Scale est. = 34.621 n = 5929
plot(h2s_dm_model_g)



g <- getViz(h2s_dm_model_g)
plot(sm(g, 2)) + l_fitRaster() + l_fitContour() + l_points()

Daily Average
# Look at daily average
h2s_da_model_a <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
Converted_Angle +
s(monitor_lon, monitor_lat, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) +
## capacity + Converted_Angle + s(monitor_lon, monitor_lat,
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.241e+00 6.271e-01 -8.357 < 2e-16 ***
## year2023 -1.050e-01 5.903e-02 -1.779 0.07532 .
## as.factor(weekday).L 7.927e-02 2.550e-02 3.109 0.00190 **
## as.factor(weekday).Q -1.710e-01 2.550e-02 -6.705 2.49e-11 ***
## as.factor(weekday).C 2.682e-02 2.531e-02 1.060 0.28935
## as.factor(weekday)^4 9.464e-04 2.526e-02 0.037 0.97011
## as.factor(weekday)^5 -5.869e-02 2.514e-02 -2.334 0.01967 *
## as.factor(weekday)^6 -4.824e-02 2.519e-02 -1.915 0.05563 .
## wd_avg 8.074e-04 1.160e-04 6.959 4.42e-12 ***
## ws_avg -1.194e-01 7.598e-03 -15.710 < 2e-16 ***
## daily_downwind_ref 2.721e-01 4.395e-02 6.190 7.05e-10 ***
## I(1/MinDist^2) 3.246e-04 3.119e-05 10.409 < 2e-16 ***
## I(1/dist_wrp^2) -1.728e-05 2.306e-06 -7.495 9.24e-14 ***
## capacity 1.668e-02 1.211e-03 13.774 < 2e-16 ***
## Converted_Angle -2.715e-03 1.050e-03 -2.585 0.00979 **
## monthly_oil_1km 5.069e-05 2.596e-05 1.952 0.05103 .
## monthly_gas_1km 2.193e-04 1.024e-04 2.141 0.03236 *
## active_1km -2.573e-02 1.059e-02 -2.429 0.01523 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.531 8 10.26 <2e-16 ***
## s(monitor_lon,monitor_lat) 8.987 9 73.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 33/35
## R-sq.(adj) = 0.395 Deviance explained = 40.3%
## GCV = 0.22365 Scale est. = 0.22066 n = 2431
plot(h2s_da_model_a)


a <- getViz(h2s_da_model_a)
plot(sm(a, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Look at daily average
h2s_da_model_b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
Converted_Angle +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) +
## capacity + Converted_Angle + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.658e+00 5.258e-01 -6.958 4.44e-12 ***
## year2023 -1.051e-01 5.903e-02 -1.780 0.0752 .
## as.factor(weekday).L 7.927e-02 2.550e-02 3.109 0.0019 **
## as.factor(weekday).Q -1.710e-01 2.550e-02 -6.705 2.50e-11 ***
## as.factor(weekday).C 2.682e-02 2.531e-02 1.060 0.2893
## as.factor(weekday)^4 9.340e-04 2.526e-02 0.037 0.9705
## as.factor(weekday)^5 -5.868e-02 2.514e-02 -2.334 0.0197 *
## as.factor(weekday)^6 -4.823e-02 2.519e-02 -1.914 0.0557 .
## wd_avg 8.074e-04 1.160e-04 6.958 4.43e-12 ***
## ws_avg -1.193e-01 7.597e-03 -15.704 < 2e-16 ***
## daily_downwind_ref 2.721e-01 4.395e-02 6.191 7.02e-10 ***
## I(1/MinDist^2) 1.464e-04 1.550e-05 9.449 < 2e-16 ***
## I(1/dist_wrp^2) -1.097e-05 1.110e-06 -9.881 < 2e-16 ***
## capacity 1.272e-02 9.049e-04 14.062 < 2e-16 ***
## Converted_Angle -2.584e-03 1.032e-03 -2.505 0.0123 *
## monthly_oil_1km 5.069e-05 2.596e-05 1.952 0.0510 .
## monthly_gas_1km 2.194e-04 1.024e-04 2.142 0.0323 *
## active_1km -2.574e-02 1.059e-02 -2.430 0.0152 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.531 8 10.26 <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.980 9 73.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 33/35
## R-sq.(adj) = 0.395 Deviance explained = 40.3%
## GCV = 0.22365 Scale est. = 0.22066 n = 2431
plot(h2s_da_model_b)


b <- getViz(h2s_da_model_b)
plot(sm(b, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_c <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## monthly_oil_1km + monthly_gas_1km + active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.449e+00 4.264e-01 -10.432 < 2e-16 ***
## year2023 -1.035e-01 5.908e-02 -1.752 0.07997 .
## as.factor(weekday).L 7.916e-02 2.553e-02 3.101 0.00195 **
## as.factor(weekday).Q -1.711e-01 2.553e-02 -6.700 2.58e-11 ***
## as.factor(weekday).C 2.689e-02 2.534e-02 1.061 0.28862
## as.factor(weekday)^4 1.225e-03 2.528e-02 0.048 0.96137
## as.factor(weekday)^5 -5.902e-02 2.517e-02 -2.345 0.01910 *
## as.factor(weekday)^6 -4.850e-02 2.522e-02 -1.923 0.05457 .
## wd_avg 8.020e-04 1.161e-04 6.906 6.35e-12 ***
## ws_avg -1.206e-01 7.591e-03 -15.883 < 2e-16 ***
## daily_downwind_ref 2.709e-01 4.400e-02 6.156 8.70e-10 ***
## capacity 1.334e-02 8.779e-04 15.194 < 2e-16 ***
## I(1/dist_wrp^2) -1.345e-05 1.080e-06 -12.454 < 2e-16 ***
## monthly_oil_1km 5.039e-05 2.598e-05 1.939 0.05257 .
## monthly_gas_1km 2.202e-04 1.025e-04 2.148 0.03182 *
## active_1km -2.551e-02 1.060e-02 -2.406 0.01620 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.528 8 10.16 <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.991 9 78.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 32/33
## R-sq.(adj) = 0.394 Deviance explained = 40.1%
## GCV = 0.22402 Scale est. = 0.22112 n = 2431
plot(h2s_da_model_c)


c <- getViz(h2s_da_model_c)
plot(sm(c, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_d <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## monthly_oil_1km + monthly_gas_1km + active_1km + daily_downwind_wrp +
## elevation + EVI
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.112e+00 7.526e-01 1.478 0.13948
## year2023 -1.057e-01 5.901e-02 -1.792 0.07328 .
## as.character(weekday)Mon -8.975e-02 3.557e-02 -2.523 0.01169 *
## as.character(weekday)Sat -9.870e-02 3.615e-02 -2.730 0.00637 **
## as.character(weekday)Sun -1.985e-01 3.594e-02 -5.522 3.72e-08 ***
## as.character(weekday)Thu -5.020e-02 3.597e-02 -1.396 0.16289
## as.character(weekday)Tue 6.448e-03 3.539e-02 0.182 0.85543
## as.character(weekday)Wed 5.342e-02 3.590e-02 1.488 0.13694
## wd_avg 8.069e-04 1.160e-04 6.955 4.54e-12 ***
## ws_avg -1.197e-01 7.601e-03 -15.753 < 2e-16 ***
## daily_downwind_ref 2.736e-01 4.391e-02 6.231 5.45e-10 ***
## capacity 9.779e-04 1.519e-03 0.644 0.51975
## I(1/dist_wrp^2) -1.370e-07 5.649e-08 -2.426 0.01536 *
## monthly_oil_1km 5.052e-05 2.595e-05 1.947 0.05166 .
## monthly_gas_1km 2.217e-04 1.024e-04 2.165 0.03050 *
## active_1km -2.590e-02 1.059e-02 -2.446 0.01451 *
## daily_downwind_wrp 5.198e-02 4.576e-02 1.136 0.25617
## elevation -1.643e-02 8.572e-03 -1.917 0.05540 .
## EVI -1.137e+00 1.537e-01 -7.399 1.88e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.525 8.000 10.29 <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 7.917 7.997 43.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 35/36
## R-sq.(adj) = 0.395 Deviance explained = 40.3%
## GCV = 0.22371 Scale est. = 0.22063 n = 2431
plot(h2s_da_model_d)


d <- getViz(h2s_da_model_d)
plot(sm(d, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_e <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_e)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.340e+00 3.648e-01 -3.673 0.000245 ***
## year2023 2.022e-01 1.393e-01 1.451 0.146918
## as.character(weekday)Mon -8.888e-02 2.901e-02 -3.063 0.002216 **
## as.character(weekday)Sat -9.492e-02 2.944e-02 -3.224 0.001283 **
## as.character(weekday)Sun -1.983e-01 2.931e-02 -6.767 1.66e-11 ***
## as.character(weekday)Thu -4.763e-02 2.934e-02 -1.623 0.104656
## as.character(weekday)Tue -2.590e-04 2.890e-02 -0.009 0.992851
## as.character(weekday)Wed 4.657e-02 2.927e-02 1.591 0.111728
## wd_avg 6.755e-04 9.969e-05 6.777 1.55e-11 ***
## ws_avg -1.099e-01 6.457e-03 -17.027 < 2e-16 ***
## daily_downwind_ref 2.112e-01 3.650e-02 5.786 8.17e-09 ***
## capacity 6.672e-03 7.272e-04 9.174 < 2e-16 ***
## I(1/dist_wrp^2) 2.738e-07 1.034e-07 2.648 0.008160 **
## monthly_oil_1km 6.383e-05 4.342e-05 1.470 0.141732
## monthly_gas_1km -3.474e-05 2.298e-04 -0.151 0.879834
## active_1km -6.203e-03 1.696e-02 -0.366 0.714536
## daily_downwind_wrp 5.572e-02 3.797e-02 1.467 0.142417
## elevation -4.655e-02 1.040e-02 -4.475 8.00e-06 ***
## EVI -1.023e+00 1.397e-01 -7.319 3.43e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.446 8.000 3.826
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.471 8.471 24.499
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.663 77.000 16.791
## p-value
## s(as.numeric(month)) 7.44e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 111/113
## R-sq.(adj) = 0.6 Deviance explained = 61.8%
## GCV = 0.15284 Scale est. = 0.14598 n = 2431
plot(h2s_da_model_e)



e <- getViz(h2s_da_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
#
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Since feb 2022
h2s_da_model_f <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_f)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum +
## precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.444e+00 3.800e-01 -3.799 0.000149 ***
## year2023 1.574e-01 1.394e-01 1.129 0.259160
## as.character(weekday)Mon -8.921e-02 2.785e-02 -3.203 0.001376 **
## as.character(weekday)Sat -9.195e-02 2.820e-02 -3.261 0.001128 **
## as.character(weekday)Sun -2.158e-01 2.825e-02 -7.638 3.20e-14 ***
## as.character(weekday)Thu -4.794e-02 2.810e-02 -1.706 0.088066 .
## as.character(weekday)Tue -2.705e-03 2.770e-02 -0.098 0.922221
## as.character(weekday)Wed 3.146e-02 2.809e-02 1.120 0.262844
## wd_avg 3.607e-04 9.868e-05 3.655 0.000263 ***
## ws_avg -8.212e-02 6.477e-03 -12.679 < 2e-16 ***
## daily_downwind_ref 1.567e-01 3.519e-02 4.454 8.84e-06 ***
## capacity 6.189e-03 7.279e-04 8.503 < 2e-16 ***
## I(1/dist_wrp^2) 2.902e-07 9.986e-08 2.907 0.003687 **
## monthly_oil_1km 1.470e-04 4.418e-05 3.327 0.000891 ***
## monthly_gas_1km -3.235e-04 2.245e-04 -1.441 0.149619
## active_1km -2.002e-02 1.695e-02 -1.181 0.237681
## daily_downwind_wrp 7.073e-02 3.636e-02 1.945 0.051897 .
## elevation -4.407e-02 9.975e-03 -4.418 1.04e-05 ***
## EVI -9.561e-01 1.355e-01 -7.059 2.21e-12 ***
## num_odor_complaints 2.644e-02 1.318e-02 2.006 0.044921 *
## I(1/dist_dc^2) 9.134e-05 8.252e-06 11.069 < 2e-16 ***
## avg_temp 8.334e-03 2.861e-03 2.913 0.003618 **
## avg_hum -6.523e-03 7.504e-04 -8.694 < 2e-16 ***
## precip -5.642e-02 3.864e-02 -1.460 0.144353
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.719 8.000 4.319
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.471 8.471 24.628
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.673 77.000 18.676
## p-value
## s(as.numeric(month)) 2.29e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 115/118
## R-sq.(adj) = 0.634 Deviance explained = 65.1%
## GCV = 0.14014 Scale est. = 0.1336 n = 2431
plot(h2s_da_model_f)



f <- getViz(h2s_da_model_f)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
#
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Disaster only
h2s_da_model_g <- gam(H2S_daily_avg ~ month + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(year == '2021', month %in% c('10', '11', '12')))
summary(h2s_da_model_g)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ month + as.character(weekday) + wd_avg + ws_avg +
## daily_downwind_ref + capacity + I(1/dist_wrp^2) + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km +
## active_1km + daily_downwind_wrp + elevation + EVI + num_odor_complaints +
## I(1/dist_dc^2) + avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2557355 0.1054716 -2.425 0.01556 *
## month11 -4.7575720 1.9622093 -2.425 0.01556 *
## month12 1.2193191 0.5029553 2.424 0.01557 *
## as.character(weekday)Mon -2.7259172 2.9255211 -0.932 0.35176
## as.character(weekday)Sat -2.8891542 2.8298781 -1.021 0.30761
## as.character(weekday)Sun -2.4956249 2.8514127 -0.875 0.38173
## as.character(weekday)Thu -0.3339398 2.8872365 -0.116 0.90795
## as.character(weekday)Tue -4.5461381 2.9772550 -1.527 0.12720
## as.character(weekday)Wed -1.2618609 2.9362292 -0.430 0.66750
## wd_avg 0.0080694 0.0093628 0.862 0.38904
## ws_avg -0.0079586 0.7510760 -0.011 0.99155
## daily_downwind_ref -2.7311456 2.7658732 -0.987 0.32375
## capacity 0.7818620 0.0804573 9.718 < 2e-16 ***
## I(1/dist_wrp^2) -0.0001603 0.0000167 -9.600 < 2e-16 ***
## monthly_oil_1km -0.0069716 0.0099354 -0.702 0.48309
## monthly_gas_1km 0.0191159 0.0350103 0.546 0.58522
## active_1km -7.7141096 3.1814972 -2.425 0.01556 *
## daily_downwind_wrp -0.8402903 3.3511018 -0.251 0.80208
## elevation 2.7201925 0.8943859 3.041 0.00244 **
## EVI 90.9526595 11.8445967 7.679 5.05e-14 ***
## num_odor_complaints 0.5226646 0.0988284 5.289 1.62e-07 ***
## I(1/dist_dc^2) 1.1056957 0.1127784 9.804 < 2e-16 ***
## avg_temp 0.3827439 0.2785720 1.374 0.16987
## avg_hum 0.0272100 0.0665065 0.409 0.68256
## precip -2.5913331 2.9921008 -0.866 0.38674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.926 8.989 12.392
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 52.823 80.000 3.456
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 109/114
## R-sq.(adj) = 0.501 Deviance explained = 55%
## GCV = 476.76 Scale est. = 429.63 n = 827
plot(h2s_da_model_g)


g <- getViz(h2s_da_model_g)
plot(sm(g, 1)) + l_fitRaster() + l_fitContour() + l_points()

# Exclude disaster
h2s_da_model_h <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
summary(h2s_da_model_h)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum +
## precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.211e+00 1.292e+00 -4.034 5.56e-05 ***
## year2021 3.980e-01 1.063e-01 3.745 0.000182 ***
## year2022 -5.043e-01 1.975e-01 -2.553 0.010690 *
## year2023 -5.863e-01 1.951e-01 -3.006 0.002662 **
## as.character(weekday)Mon -5.663e-02 2.403e-02 -2.356 0.018491 *
## as.character(weekday)Sat -8.496e-02 2.400e-02 -3.539 0.000404 ***
## as.character(weekday)Sun -1.725e-01 2.406e-02 -7.170 8.41e-13 ***
## as.character(weekday)Thu -1.198e-02 2.402e-02 -0.499 0.617931
## as.character(weekday)Tue -1.297e-02 2.391e-02 -0.542 0.587578
## as.character(weekday)Wed 1.436e-02 2.407e-02 0.596 0.550993
## wd_avg 1.596e-04 8.894e-05 1.795 0.072754 .
## ws_avg -7.186e-02 5.614e-03 -12.801 < 2e-16 ***
## daily_downwind_ref 1.371e-02 2.425e-02 0.565 0.571907
## capacity 1.852e-02 2.451e-03 7.557 4.77e-14 ***
## I(1/dist_wrp^2) -7.037e-06 2.159e-06 -3.260 0.001122 **
## monthly_oil_1km -2.609e-05 1.723e-05 -1.514 0.130036
## monthly_gas_1km -1.911e-04 6.998e-05 -2.730 0.006344 **
## active_1km -3.876e-02 7.232e-03 -5.360 8.63e-08 ***
## daily_downwind_wrp 3.779e-02 2.907e-02 1.300 0.193671
## elevation 6.682e-02 1.179e-02 5.665 1.54e-08 ***
## EVI 8.998e-01 2.927e-01 3.074 0.002122 **
## num_odor_complaints 2.776e-02 1.131e-02 2.454 0.014152 *
## I(1/dist_dc^2) 5.424e-02 1.576e-02 3.442 0.000582 ***
## avg_temp 5.816e-03 2.110e-03 2.757 0.005851 **
## avg_hum -6.005e-03 5.943e-04 -10.106 < 2e-16 ***
## precip -3.502e-02 4.400e-02 -0.796 0.426051
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.898 8 41.37
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 26.18
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 76.137 80 23.23
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 121/123
## R-sq.(adj) = 0.454 Deviance explained = 46.5%
## GCV = 0.24724 Scale est. = 0.24236 n = 5929
plot(h2s_da_model_h)



h <- getViz(h2s_da_model_h)
plot(sm(h, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Disaster indicator
h2s_da_model_i <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip + disaster,
data = daily_full %>% mutate(disaster = if_else(year == '2021',
month %in% c('10', '11', '12'), 1, 0)))
summary(h2s_da_model_i)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum +
## precip + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.345e+02 1.384e+01 -24.166 < 2e-16 ***
## year2021 3.786e+00 1.773e+00 2.135 0.03276 *
## year2022 3.927e+00 2.212e+00 1.775 0.07589 .
## year2023 1.876e+00 2.543e+00 0.738 0.46077
## as.character(weekday)Mon -3.304e-01 3.918e-01 -0.843 0.39914
## as.character(weekday)Sat -2.755e-01 3.907e-01 -0.705 0.48077
## as.character(weekday)Sun -3.018e-01 3.908e-01 -0.772 0.44001
## as.character(weekday)Thu -3.277e-01 3.912e-01 -0.838 0.40225
## as.character(weekday)Tue -6.803e-01 3.901e-01 -1.744 0.08123 .
## as.character(weekday)Wed -3.763e-01 3.915e-01 -0.961 0.33648
## wd_avg 1.342e-03 1.408e-03 0.954 0.34022
## ws_avg -7.416e-02 9.036e-02 -0.821 0.41185
## daily_downwind_ref -6.152e-02 3.919e-01 -0.157 0.87528
## capacity 6.528e-01 2.477e-02 26.352 < 2e-16 ***
## I(1/dist_wrp^2) -2.505e-04 1.063e-05 -23.576 < 2e-16 ***
## monthly_oil_1km 2.888e-04 2.638e-04 1.095 0.27359
## monthly_gas_1km 6.262e-04 1.004e-03 0.624 0.53293
## active_1km 2.737e-02 9.507e-02 0.288 0.77341
## daily_downwind_wrp 4.122e-01 4.694e-01 0.878 0.38000
## elevation 2.384e+00 1.499e-01 15.904 < 2e-16 ***
## EVI 7.512e+01 3.050e+00 24.632 < 2e-16 ***
## num_odor_complaints 1.000e+00 2.806e-02 35.645 < 2e-16 ***
## I(1/dist_dc^2) 1.932e+00 7.865e-02 24.569 < 2e-16 ***
## avg_temp 2.012e-02 3.278e-02 0.614 0.53943
## avg_hum -1.387e-04 8.989e-03 -0.015 0.98769
## precip -2.459e-01 6.049e-01 -0.406 0.68440
## disaster 3.501e+00 1.127e+00 3.105 0.00191 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 4.41 8 1.537
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.00 9 74.183
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 66.07 80 4.241
## p-value
## s(as.numeric(month)) 0.00666 **
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 122/124
## R-sq.(adj) = 0.32 Deviance explained = 33%
## GCV = 74.332 Scale est. = 73.182 n = 6756
plot(h2s_da_model_i)



i <- getViz(h2s_da_model_i)
plot(sm(i, 2)) + l_fitRaster() + l_fitContour() + l_points()

plotRGL(sm(i, 2), fix = c(`as.numeric(day)` = 0), residuals = F)
# try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_i), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Everything
h2s_da_model_j <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full)
summary(h2s_da_model_j)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum +
## precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.317e+02 1.379e+01 -24.050 <2e-16 ***
## year2021 7.002e-01 1.447e+00 0.484 0.6285
## year2022 7.250e-01 1.920e+00 0.378 0.7057
## year2023 -7.798e-01 2.332e+00 -0.334 0.7381
## as.character(weekday)Mon -3.531e-01 3.920e-01 -0.901 0.3677
## as.character(weekday)Sat -2.631e-01 3.909e-01 -0.673 0.5009
## as.character(weekday)Sun -3.038e-01 3.911e-01 -0.777 0.4373
## as.character(weekday)Thu -3.481e-01 3.914e-01 -0.889 0.3739
## as.character(weekday)Tue -7.030e-01 3.903e-01 -1.801 0.0717 .
## as.character(weekday)Wed -3.883e-01 3.917e-01 -0.991 0.3217
## wd_avg 1.313e-03 1.408e-03 0.933 0.3510
## ws_avg -6.694e-02 9.034e-02 -0.741 0.4587
## daily_downwind_ref -8.039e-02 3.921e-01 -0.205 0.8376
## capacity 6.535e-01 2.475e-02 26.404 <2e-16 ***
## I(1/dist_wrp^2) -2.516e-04 1.055e-05 -23.849 <2e-16 ***
## monthly_oil_1km 3.635e-04 2.615e-04 1.390 0.1646
## monthly_gas_1km 5.075e-04 9.945e-04 0.510 0.6099
## active_1km 3.354e-02 9.390e-02 0.357 0.7210
## daily_downwind_wrp 3.977e-01 4.697e-01 0.847 0.3972
## elevation 2.388e+00 1.499e-01 15.930 <2e-16 ***
## EVI 7.526e+01 3.046e+00 24.712 <2e-16 ***
## num_odor_complaints 1.019e+00 2.746e-02 37.121 <2e-16 ***
## I(1/dist_dc^2) 1.934e+00 7.828e-02 24.705 <2e-16 ***
## avg_temp 2.961e-02 3.264e-02 0.907 0.3643
## avg_hum -1.644e-04 8.979e-03 -0.018 0.9854
## precip -2.784e-01 6.048e-01 -0.460 0.6453
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 4.10 8 1.261
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.00 9 74.816
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 65.67 80 4.096
## p-value
## s(as.numeric(month)) 0.0158 *
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 121/123
## R-sq.(adj) = 0.319 Deviance explained = 32.9%
## GCV = 74.425 Scale est. = 73.293 n = 6756
plot(h2s_da_model_j)



knitr::kable(tibble(Model = c('Since Feb 2022',
'Disaster Only',
'Exclude Disaster',
'Everything w Disaster Indicator',
'Everything w.o Disaster Indicator'),
'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
round(summary(h2s_da_model_g)$r.sq, 2),
round(summary(h2s_da_model_h)$r.sq, 2),
round(summary(h2s_da_model_i)$r.sq, 2),
round(summary(h2s_da_model_j)$r.sq, 2))))
| Since Feb 2022 |
0.63 |
| Disaster Only |
0.50 |
| Exclude Disaster |
0.45 |
| Everything w Disaster Indicator |
0.32 |
| Everything w.o Disaster Indicator |
0.32 |
80/20 Cross Validation on Daily Average models
result <- tibble(Model = character(),
'80/20 Train R-Sq' = numeric(),
'80/20 Test R-Sq' = numeric())
predictors <- c('H2S_daily_avg', 'month', 'year', 'weekday', 'wd_avg', 'ws_avg', 'daily_downwind_ref', 'dist_wrp',
'mon_utm_x', 'mon_utm_y', 'day', 'monthly_oil_1km', 'monthly_gas_1km',
'active_1km', 'daily_downwind_wrp', 'elevation', 'EVI', 'num_odor_complaints',
'dist_dc', 'capacity', 'avg_temp', 'avg_hum', 'precip')
set.seed(313)
# model 1: since Feb 2022
train <- daily_full %>%
filter(day >= '2022-02-01') %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)) %>%
sample_frac(0.8)
test <- anti_join(daily_full %>%
filter(day >= '2022-02-01') %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f2 <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f2, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2)$n - 1)/
(summary(h2s_da_model_f2)$n - summary(h2s_da_model_f2)$np - 1)
result <- rbind(result, tibble(Model = 'Since Feb 2022',
'80/20 Train R-Sq' = summary(h2s_da_model_f2)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
f2_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 GAM') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
# Model 2: Disaster (Oct-Dec 2021) only
train <- daily_full %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)) %>%
sample_frac(0.8)
test <- anti_join(daily_full %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f3 <- gam(H2S_daily_avg~as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f3, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3)$n - 1)/
(summary(h2s_da_model_f3)$n - summary(h2s_da_model_f3)$np - 1)
result <- rbind(result, tibble(Model = 'Disaster Only',
'80/20 Train R-Sq' = summary(h2s_da_model_f3)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
f3_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only GAM') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
# Model 3: Excluding Disaster
train <- daily_full %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)) %>%
sample_frac(0.8)
test <- anti_join(daily_full %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f4 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f4, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4)$n - 1)/
(summary(h2s_da_model_f4)$n - summary(h2s_da_model_f4)$np - 1)
result <- rbind(result, tibble(Model = 'Exclude Disaster',
'80/20 Train R-Sq' = summary(h2s_da_model_f4)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
# Model 4: Everything with Disaster Indicator
train <- daily_full %>%
select(all_of(predictors)) %>%
mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>%
filter(complete.cases(.))
train <- rbind(
train %>% filter(disaster==1) %>% sample_frac(0.8),
train %>% filter(disaster==0) %>% sample_frac(0.8)
)
test <- anti_join(daily_full %>%
select(all_of(predictors)) %>%
mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip, disaster)`
h2s_da_model_f5 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
disaster +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f5, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5)$n - 1)/
(summary(h2s_da_model_f5)$n - summary(h2s_da_model_f5)$np - 1)
result <- rbind(result, tibble(Model = 'Everything w. Disaster Indicator',
'80/20 Train R-Sq' = summary(h2s_da_model_f5)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
# Model 5: Everything
train <- daily_full %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)) %>%
sample_frac(0.8)
train <- rbind(
train %>% filter(year == '2021' & month %in% c('10', '11', '12')) %>% sample_frac(0.8),
train %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% sample_frac(0.8)
)
test <- anti_join(daily_full %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f6 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f6, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f6)$n - 1)/
(summary(h2s_da_model_f5)$n - summary(h2s_da_model_f6)$np - 1)
result <- rbind(result, tibble(Model = 'Everything w.o Disaster Indicator',
'80/20 Train R-Sq' = summary(h2s_da_model_f6)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
f6_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator Gam') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
f6_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
xlim(NA, 15) + ylim(NA, 15) +
theme_bw()
knitr::kable(tibble(Model = c('Since Feb 2022',
'Disaster Only',
'Exclude Disaster',
'Everything w Disaster Indicator',
'Everything w.o Disaster Indicator'),
'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
round(summary(h2s_da_model_g)$r.sq, 2),
round(summary(h2s_da_model_h)$r.sq, 2),
round(summary(h2s_da_model_i)$r.sq, 2),
round(summary(h2s_da_model_j)$r.sq, 2))) %>%
left_join(result, join_by(Model)))
| Since Feb 2022 |
0.63 |
0.6221825 |
0.6290050 |
| Disaster Only |
0.50 |
0.5148863 |
-0.1386304 |
| Exclude Disaster |
0.45 |
0.4474583 |
0.4523375 |
| Everything w Disaster Indicator |
0.32 |
NA |
NA |
| Everything w.o Disaster Indicator |
0.32 |
0.3812678 |
0.4193177 |
Cross Validation on each monitor
result_cv <- tibble(Model = character(),
'CV Avg Train R-Sq' = numeric(),
'CV Avg Test R-Sq' = numeric())
# model 1: since Feb 2022
post2022feb <- daily_full %>%
filter(day >= '2022-02-01') %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.))
monitors <- unique(post2022feb$Monitor)
cv1_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- post2022feb %>%
filter(Monitor != monitors[i])
test <- post2022feb %>%
filter(Monitor == monitors[i])
h2s_da_model_f2b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f2b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
(summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
cv1_result <- rbind(cv1_result, tibble(monitors[i],
summary(h2s_da_model_f2b)$r.sq,
summary(h2s_da_model_f2b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Since Feb 2022',
'CV Avg Train R-Sq' = mean(cv1_result$`summary(h2s_da_model_f2b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv1_result$adj_r2_test))))
# model 2: Disaster only
disaster <- daily_full %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.))
monitors <- unique(disaster$Monitor)
cv2_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- disaster %>%
filter(Monitor != monitors[i])
test <- disaster %>%
filter(Monitor == monitors[i])
h2s_da_model_f3b <- gam(H2S_daily_avg~ month + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f3b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3b)$n - 1)/
(summary(h2s_da_model_f3b)$n - summary(h2s_da_model_f3b)$np - 1)
cv2_result <- rbind(cv2_result, tibble(monitors[i],
summary(h2s_da_model_f3b)$r.sq,
summary(h2s_da_model_f3b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Disaster Only',
'CV Avg Train R-Sq' = mean(cv2_result$`summary(h2s_da_model_f3b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv2_result$adj_r2_test))))
# model 3: Exclude Disaster
exclude_disaster <- daily_full %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.))
monitors <- unique(exclude_disaster$Monitor)
cv3_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- exclude_disaster %>%
filter(Monitor != monitors[i])
test <- exclude_disaster %>%
filter(Monitor == monitors[i])
h2s_da_model_f4b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f4b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4b)$n - 1)/
(summary(h2s_da_model_f4b)$n - summary(h2s_da_model_f4b)$np - 1)
cv3_result <- rbind(cv3_result, tibble(monitors[i],
summary(h2s_da_model_f4b)$r.sq,
summary(h2s_da_model_f4b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Exclude Disaster',
'CV Avg Train R-Sq' = mean(cv3_result$`summary(h2s_da_model_f4b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv3_result$adj_r2_test))))
# model 4: Everything w Disaster Indicator
full_complete <- daily_full %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.)) %>%
mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0))
monitors <- unique(full_complete$Monitor)
cv4_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- full_complete %>%
filter(Monitor != monitors[i])
test <- full_complete %>%
filter(Monitor == monitors[i])
h2s_da_model_f5b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
disaster + capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f5b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5b)$n - 1)/
(summary(h2s_da_model_f5b)$n - summary(h2s_da_model_f5b)$np - 1)
cv4_result <- rbind(cv4_result, tibble(monitors[i],
summary(h2s_da_model_f5b)$r.sq,
summary(h2s_da_model_f5b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Everything w Disaster Indicator',
'CV Avg Train R-Sq' = mean(cv4_result$`summary(h2s_da_model_f5b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv4_result$adj_r2_test))))
# model 5: Everything w.o Disaster Indicator
full_complete <- daily_full %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.))
monitors <- unique(full_complete$Monitor)
cv5_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- full_complete %>%
filter(Monitor != monitors[i])
test <- full_complete %>%
filter(Monitor == monitors[i])
h2s_da_model_f6b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f6b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f6b)$n - 1)/
(summary(h2s_da_model_f6b)$n - summary(h2s_da_model_f6b)$np - 1)
cv5_result <- rbind(cv5_result, tibble(monitors[i],
summary(h2s_da_model_f6b)$r.sq,
summary(h2s_da_model_f6b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Everything w.o Disaster Indicator',
'CV Avg Train R-Sq' = mean(cv5_result$`summary(h2s_da_model_f6b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv5_result$adj_r2_test))))
cv1_result
cv2_result
cv3_result
cv4_result
cv5_result
XGBoost Since 2022 Feb
xgb_result <- tibble(Model = character(),
'R-Sq' = numeric(),
'80/20 Train R-Sq' = numeric(),
'80/20 Test R-Sq' = numeric())
daily_full <- daily_full %>%
mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
Monitor = str_replace_all(Monitor, ' ', '_'),
weekday = as.character(weekday),
daily_downwind_ref = as.integer(daily_downwind_ref),
daily_downwind_wrp = as.integer(daily_downwind_wrp))
train <- daily_full[complete.cases(daily_full),] %>%
filter(day >= '2022-02-01')
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
max_depth = c(3, 4, 5),
eta = c(0.1, 0.3),
gamma = c(0.01, 0.001),
colsample_bytree = c(0.5, 1),
min_child_weight = 0,
subsample = c(0.5, 0.75, 1))
# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv",
number=10,
verboseIter=TRUE,
search='grid')
fit.xgb_da <- readRDS('rfiles/fit.xgb_da.rds')
# fit.xgb_da <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da, 'rfiles/fit.xgb_da.rds')
getTrainPerf(fit.xgb_da)
fit.xgb_da$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.2 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.001", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 42
## niter: 500
## nfeatures : 42
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 78 500 5 0.1 0.001 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 3.83e+05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 4000
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da$finalModel, top_n = 20)

80/20 CV
set.seed(313)
train <- daily_full[complete.cases(daily_full),] %>%
filter(day >= '2022-02-01') %>%
sample_frac(0.8)
test <- anti_join(daily_full[complete.cases(daily_full),] %>% filter(day >= '2022-02-01'),
train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
test <- fastDummies::dummy_cols(test %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
max_depth = c(3, 4, 5),
eta = c(0.1, 0.3),
gamma = c(0.01, 0.001),
colsample_bytree = c(0.5, 1),
min_child_weight = 0,
subsample = c(0.5, 0.75, 1))
# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv",
number=10,
verboseIter=TRUE,
search='grid')
fit.xgb_da_8020 <- readRDS('rfiles/fit.xgb_da_8020.rds')
# fit.xgb_da_8020 <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_8020, 'rfiles/fit.xgb_da_8020.rds')
getTrainPerf(fit.xgb_da_8020)
# Test statistics
predictions <- predict(fit.xgb_da_8020$finalModel,
newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions, test %>% pull(H2S_daily_avg))
test_stat
## RMSE Rsquared MAE
## 0.2871070 0.7610102 0.1862030
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
'R-Sq' = getTrainPerf(fit.xgb_da)$TrainRsquared,
'80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_8020)$TrainRsquared,
'80/20 Test R-Sq' = test_stat[[2]]))
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 XGBoost') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
fit.xgb_da_8020$finalModel
## ##### xgb.Booster
## raw: 1.2 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## # of features: 42
## niter: 500
## nfeatures : 42
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96 500 5 0.1 0.01 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da_8020$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 3.83e+05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 4000
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in sqrt(sum.squares/one.delta): NaNs produced

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da_8020$finalModel, top_n = 20)

CV by leaving each monitor out once
post2022feb <- daily_full %>%
filter(day >= '2022-02-01')
monitor_names <- unique(post2022feb$Monitor)
# for (i in 1:length(monitor_names)) {
# train <- post2022feb %>%
# filter(Monitor != monitor_names[i])
#
# train <- train[complete.cases(train),]
#
# train <- fastDummies::dummy_cols(train %>%
# select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
# monitor_lat, monitor_lon, county, dist_213)) %>%
# mutate(MinDist = 1/(MinDist^2),
# dist_wrp = 1/(dist_wrp^2)),
# remove_selected_columns = TRUE)
#
# fit.xgb_da <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
#
# saveRDS(fit.xgb_da, paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
# }
model_stats <- tibble(Monitor = character(), train_r2 = numeric(), test_r2 = numeric())
add_cols <- function(df, cols) {
add <- cols[!cols %in% names(df)]
if(length(add) !=0 ) df[add] <- NA
return(df)
}
for (i in setdiff(1:length(monitor_names), c(9))) {
fit.xgb_da <- readRDS(paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
test <- post2022feb %>%
filter(Monitor == monitor_names[i])
test <- fastDummies::dummy_cols(test[complete.cases(test),] %>%
ungroup() %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213)) %>%
add_cols(c('year_2022', 'year_2023', 'month_01', 'month_02',
'month_03', 'month_04', 'month_05', 'month_06',
'month_07', 'month_08', 'month_09', 'month_10',
'month_11', 'month_12')) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2),
year_2022 = ifelse(is.na(year_2022), 0, year_2022),
year_2023 = ifelse(is.na(year_2023), 0, year_2023),
month_01 = ifelse(is.na(month_01), 0, month_01),
month_02 = ifelse(is.na(month_02), 0, month_02),
month_03 = ifelse(is.na(month_03), 0, month_03),
month_04 = ifelse(is.na(month_04), 0, month_04),
month_05 = ifelse(is.na(month_05), 0, month_05),
month_06 = ifelse(is.na(month_06), 0, month_06),
month_07 = ifelse(is.na(month_07), 0, month_07),
month_08 = ifelse(is.na(month_08), 0, month_08),
month_09 = ifelse(is.na(month_09), 0, month_09),
month_10 = ifelse(is.na(month_10), 0, month_10),
month_11 = ifelse(is.na(month_11), 0, month_11),
month_12 = ifelse(is.na(month_12), 0, month_12)),
remove_selected_columns = TRUE)
train_r2 <- getTrainPerf(fit.xgb_da)$TrainRsquared
predictions <- predict(fit.xgb_da$finalModel,
newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_r2 <- R2(test %>% pull(H2S_daily_avg), predictions)
model_stats <- rbind(model_stats, tibble(Monitor = monitor_names[i], train_r2, test_r2))
}
model_stats
tibble(Monitor = 'Total Average', train_r2 = mean(model_stats$train_r2, na.rm = T),
test_r2 = mean(model_stats$test_r2, na.rm = T))
XGBoost Disaster
Daily average model
train <- daily_full[complete.cases(daily_full),] %>%
filter(year == '2021', month %in% c('10', '11', '12'))
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
fit.xgb_da_dis <- readRDS('rfiles/fit.xgb_da_dis.rds')
# fit.xgb_da_dis <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_dis, 'rfiles/fit.xgb_da_dis.rds')
getTrainPerf(fit.xgb_da_dis)
fit.xgb_da_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 500
## nfeatures : 31
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96 500 5 0.1 0.01 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da_dis$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da_dis$finalModel, top_n = 20)

80/20 CV
set.seed(313)
train <- daily_full[complete.cases(daily_full),] %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
sample_frac(0.8)
test <- anti_join(daily_full[complete.cases(daily_full),] %>% filter(year == '2021', month %in% c('10', '11', '12')),
train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
test <- fastDummies::dummy_cols(test %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
fit.xgb_da_dis_8020 <- readRDS('rfiles/fit.xgb_da_dis_8020.rds')
# fit.xgb_da_dis_8020 <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_dis_8020, 'rfiles/fit.xgb_da_dis_8020.rds')
getTrainPerf(fit.xgb_da_dis_8020)
# Test statistics
predictions <- predict(fit.xgb_da_dis_8020$finalModel,
newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions,
test %>% pull(H2S_daily_avg))
test_stat
## RMSE Rsquared MAE
## 21.0148229 0.6897725 3.6093483
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
'R-Sq' = getTrainPerf(fit.xgb_da_dis)$TrainRsquared,
'80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_dis_8020)$TrainRsquared,
'80/20 Test R-Sq' = test_stat[[2]]))
xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x) +
labs(y = 'Observed', x = 'Predicted',
title = 'Disaster Only XGBoost') +
theme_bw()
xgb_dis_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
xlim(0, 10) + ylim(0, 10) +
theme_bw()
fit.xgb_da_dis_8020$finalModel
## ##### xgb.Booster
## raw: 209.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## # of features: 31
## niter: 100
## nfeatures : 31
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 205 100 5 0.3 0.01 0.5 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da_dis_8020$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da_dis_8020$finalModel, top_n = 20)

XGBoost Full
Daily average model
train <- daily_full[complete.cases(daily_full),]
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
fit.xgb_da_full <- readRDS('rfiles/fit.xgb_da_full.rds')
# fit.xgb_da_full <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_full, 'rfiles/fit.xgb_da_full.rds')
getTrainPerf(fit.xgb_da_full)
fit.xgb_da_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 114.2 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "3", gamma = "0.001", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.5", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 40
## niter: 100
## nfeatures : 40
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 1 100 3 0.1 0.001 0.5 0 0.5
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da_full$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da_full$finalModel, top_n = 20)

80/20 CV
set.seed(313)
train <- rbind(
daily_full[complete.cases(daily_full),] %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
sample_frac(0.8),
daily_full[complete.cases(daily_full),] %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
sample_frac(0.8)
)
test <- anti_join(daily_full[complete.cases(daily_full),], train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
test <- fastDummies::dummy_cols(test %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
fit.xgb_da_full_8020 <- readRDS('rfiles/fit.xgb_da_full_8020.rds')
# fit.xgb_da_full_8020 <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_full_8020, 'rfiles/fit.xgb_da_full_8020.rds')
getTrainPerf(fit.xgb_da_full_8020)
# Test statistics
predictions <- predict(fit.xgb_da_full_8020$finalModel,
newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions,
test %>% pull(H2S_daily_avg))
test_stat
## RMSE Rsquared MAE
## 5.5777177 0.7863225 0.5455890
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
'R-Sq' = getTrainPerf(fit.xgb_da_full)$TrainRsquared,
'80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_full_8020)$TrainRsquared,
'80/20 Test R-Sq' = test_stat[[2]]))
xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x) +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator XGBoost') +
theme_bw()
xgb_full_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
xlim(0, 10) + ylim(0, 10) +
theme_bw()
fit.xgb_da_full_8020$finalModel
## ##### xgb.Booster
## raw: 1.1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## # of features: 40
## niter: 500
## nfeatures : 40
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 99 500 5 0.1 0.01 0.5 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da_full_8020$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da_full_8020$finalModel, top_n = 20)
